library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: lme4
## 
## arm (Version 1.14-4, built: 2024-4-1)
## 
## Working directory is /Users/Holden/Library/CloudStorage/Dropbox/My Mac (Holden’s MacBook Pro)/Desktop/dev/school/ocn_683
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## 
## The following object is masked from 'package:arm':
## 
##     logit
library(MASS)

# Copepod (Acartia) survival data: 91 rows x 11 numeric columns per the
# read_csv report below. `nx` is the count of survivors per replicate
# (out of 25 animals, per the p = mean(nx)/25 calculation later — TODO
# confirm against the data dictionary). The `...1` column is an unnamed
# row-index column created on export.
acartia <- read_csv('data/dam_acartia_survival_subset.csv')
## New names:
## Rows: 91 Columns: 11
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (11): ...1, Generation, Treatment, Temp, pH, Rep, Beak, time, nx, lx, ni
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Yellow tang fish-survey data: 1373 rows x 60 mixed-type columns per the
# report below; the `count` column (fish counts) is what gets analyzed.
yellow_tang <- read_csv('data/yellowtang.csv')
## New names:
## Rows: 1373 Columns: 60
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (21): commonname, region_name, island, site, reef_zone, depth_bin, dive... dbl
## (36): ...1, X, latitude, longitude, sitevisitid, obs_year, photographer... lgl
## (1): other_type dttm (2): time, date_
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

#1. Histogram of the data - use discrete.histogram()

# histogram of nx (survivor counts) -- this is the response of interest
acartia$nx %>% discrete.histogram()

#2. mean and variance of number of survivors - mean = 17.38 - var = 58.11

acartia$nx %>% mean()
## [1] 17.38462
acartia$nx %>% var()
## [1] 58.10598

what should the variance be, given the observed mean, if the distribution is binomial? - for a binomial count out of 25 trials, var = 25·p·(1−p) ≈ 5.3, far below the observed 58.1 (note: plugging the number of replicates, 91, in for n gives 19.28, but the binomial n is the 25 animals per beaker, not the sample size)

# Binomial variance of a count is size * p * (1 - p), where `size` is the
# number of trials PER observation — here the 25 animals per beaker, NOT
# the number of rows in the data. (Using nrow() here, as originally done,
# gives ~19.3, which is not the binomial variance of nx; note the simulated
# binomial variances below come out near 5.3, confirming the fix.)

# p, prob of survival per individual (each replicate starts with 25 animals)
p <- (mean(acartia$nx) / 25)

# n, number of replicate observations — kept because it is used later as the
# number of random draws to simulate (it is NOT the binomial size)
n <- nrow(acartia)

# expected variance if nx ~ Binomial(25, p)
25 * p * (1 - p)
## expected ≈ 5.3 — far below the observed 58.1, i.e. the data are
## overdispersed relative to a binomial distribution

#3. rbinom() to simulate draws from dist with same prob of survival, same length

# here's how we do it once
random_bugs <- rbinom(n, 25, p)

# discrete.histogram() is called for its plotting side effect and does not
# return the data, so it must NOT be piped into mean()/var() — the original
# chained version returned NA with a warning. Call each summary separately.
discrete.histogram(random_bugs)
mean(random_bugs)
var(random_bugs)

repeat this 4 more times

# repeat the binomial simulation five times, showing a histogram and the
# sample mean/variance for each draw

set.seed(123)

for (draw in seq_len(5)) {
  sim_counts <- rbinom(n, 25, p)

  # show the simulated distribution
  discrete.histogram(sim_counts)

  # report summary statistics for this draw
  cat("\nIteration", draw, ":\n")
  cat("Mean:", mean(sim_counts), "\n")
  cat("Variance:", var(sim_counts), "\n\n")

  Sys.sleep(1)
}

## 
## Iteration 1 :
## Mean: 17.36264 
## Variance: 5.4337

## 
## Iteration 2 :
## Mean: 17.38462 
## Variance: 3.77265

## 
## Iteration 3 :
## Mean: 17.26374 
## Variance: 4.907448

## 
## Iteration 4 :
## Mean: 17.49451 
## Variance: 6.252747

## 
## Iteration 5 :
## Mean: 17.47253 
## Variance: 5.252015

how and why do simulated and observed data differ? - the observed data have a much more skewed dist - they are zero-inflated - we don’t see any 0’s in the simulated data - the observed data appear to be bimodal - a peak at 0, and much higher survival for those that do survive (peaking at roughly 19) - there is some variation in the simulated output, so it is possible (but exceedingly unlikely) that the sample size is simply too low; if the observed data truly followed the binomial dist, we might see more similarity with more replication - most likely, the observed data do not follow the binomial dist! seems like there has to be some way to manage this zero-inflation…

#4. beta binomial - allows for prob of success to vary across trials

# using same n, p as above

# rho = 0 collapses the beta-binomial to an ordinary binomial, so this draw
# is the baseline before exploring rho > 0 in the loop below
beta_random_bugs <- rbetabinom(n, 25, p, rho = 0)

make a loop, create histograms across full possible range of rho (0:1)

set.seed(123)

rho_values <- seq(0, 1, length.out = 10)  # generate 10 rho values from 0 to 1

# Simulate beta-binomial draws for each rho (the overdispersion/correlation
# parameter), plotting a histogram and printing the sample mean/variance so
# each rho can be compared against the observed data (mean ~17.4, var ~58.1).
# rho = 0 reduces to an ordinary binomial; larger rho inflates the variance.
#
# rho_values: correlation parameters in [0, 1] to try
# n_draws:    number of random draws per rho (defaults to global n, the
#             number of observed replicates, for backward compatibility)
# size:       binomial trials per draw (25 animals per beaker)
# prob:       per-trial survival probability (defaults to global p)
#
# Called for its plotting/printing side effects; returns nothing useful.
simulate_beta_binomial <- function(rho_values, n_draws = n, size = 25,
                                   prob = p) {
  for (rho in rho_values) {
    beta_random_bugs <- rbetabinom(n_draws, size, prob, rho)

    # plot discrete histogram
    discrete.histogram(beta_random_bugs, xlab = "Count",
                       main = paste("Histogram for rho =", round(rho, 2)))

    # calculate mean and variance
    mean_value <- mean(beta_random_bugs)
    var_value <- var(beta_random_bugs)

    # print results
    cat("\nRho =", round(rho, 2), ":\n")
    cat("Mean:", mean_value, "\n")
    cat("Variance:", var_value, "\n\n")

    Sys.sleep(1)
  }
}

simulate_beta_binomial(rho_values)

## 
## Rho = 0 :
## Mean: 17.36264 
## Variance: 5.4337

## 
## Rho = 0.11 :
## Mean: 17.9011 
## Variance: 11.80122

## 
## Rho = 0.22 :
## Mean: 17.20879 
## Variance: 35.14481

## 
## Rho = 0.33 :
## Mean: 17.52747 
## Variance: 45.05201

## 
## Rho = 0.44 :
## Mean: 17.20879 
## Variance: 63.05592

## 
## Rho = 0.56 :
## Mean: 17.49451 
## Variance: 75.38608

## 
## Rho = 0.67 :
## Mean: 18.10989 
## Variance: 87.05446

## 
## Rho = 0.78 :
## Mean: 18.31868 
## Variance: 94.8862

## 
## Rho = 0.89 :
## Mean: 18.78022 
## Variance: 103.5956

## 
## Rho = 1 :
## Mean: 14.56044 
## Variance: 153.6935

visually inspect, seems like between .3 to 1.0 is more promising, loop here

# narrow the rho search to 0.3-1.0
rho_values <- seq(from = 0.3, to = 1, length.out = 10)

simulate_beta_binomial(rho_values)

## 
## Rho = 0.3 :
## Mean: 17.83516 
## Variance: 52.51697

## 
## Rho = 0.38 :
## Mean: 18.02198 
## Variance: 56.17729

## 
## Rho = 0.46 :
## Mean: 15.86813 
## Variance: 71.02686

## 
## Rho = 0.53 :
## Mean: 16.03297 
## Variance: 80.47668

## 
## Rho = 0.61 :
## Mean: 18.42857 
## Variance: 75.46984

## 
## Rho = 0.69 :
## Mean: 16.49451 
## Variance: 87.16386

## 
## Rho = 0.77 :
## Mean: 16.47253 
## Variance: 112.9631

## 
## Rho = 0.84 :
## Mean: 15.69231 
## Variance: 136.3043

## 
## Rho = 0.92 :
## Mean: 17.35165 
## Variance: 120.7416

## 
## Rho = 1 :
## Mean: 11.26374 
## Variance: 156.4408

narrowing the search again, this time to rho between 0.53 and 0.84

# narrow further: rho between 0.53 and 0.84
rho_values <- seq(from = 0.53, to = 0.84, length.out = 10)

simulate_beta_binomial(rho_values)

## 
## Rho = 0.53 :
## Mean: 18.62637 
## Variance: 73.9033

## 
## Rho = 0.56 :
## Mean: 15.27473 
## Variance: 87.1348

## 
## Rho = 0.6 :
## Mean: 17.79121 
## Variance: 77.32259

## 
## Rho = 0.63 :
## Mean: 18.49451 
## Variance: 78.85275

## 
## Rho = 0.67 :
## Mean: 18.64835 
## Variance: 75.23053

## 
## Rho = 0.7 :
## Mean: 18.45055 
## Variance: 82.67253

## 
## Rho = 0.74 :
## Mean: 17.48352 
## Variance: 93.31917

## 
## Rho = 0.77 :
## Mean: 15.92308 
## Variance: 122.5607

## 
## Rho = 0.81 :
## Mean: 16.65934 
## Variance: 110.8049

## 
## Rho = 0.84 :
## Mean: 16.79121 
## Variance: 119.7448

try 0.77 to 0.9? Honestly not seeing any of these look super close

# final pass: rho between 0.77 and 0.90
rho_values <- seq(from = 0.77, to = 0.9, length.out = 10)

simulate_beta_binomial(rho_values)

## 
## Rho = 0.77 :
## Mean: 16.12088 
## Variance: 107.8408

## 
## Rho = 0.78 :
## Mean: 16.35165 
## Variance: 109.4305

## 
## Rho = 0.8 :
## Mean: 16.7033 
## Variance: 114.3888

## 
## Rho = 0.81 :
## Mean: 18.74725 
## Variance: 98.36874

## 
## Rho = 0.83 :
## Mean: 15.53846 
## Variance: 123.4735

## 
## Rho = 0.84 :
## Mean: 17.52747 
## Variance: 111.1631

## 
## Rho = 0.86 :
## Mean: 18.92308 
## Variance: 100.1829

## 
## Rho = 0.87 :
## Mean: 17.96703 
## Variance: 112.5656

## 
## Rho = 0.89 :
## Mean: 18.79121 
## Variance: 100.6337

## 
## Rho = 0.9 :
## Mean: 18.42857 
## Variance: 106.5143

hmm, well if I have to pick one, I guess let’s go with rho = 0.67? - but again, none of these appropriately mirror the observed distribution in my opinion

#5. histogram of yellow tang counts

yellow_tang$count %>% discrete.histogram()

#6. mean var of count - mean = 5.03 - var = 35.97

yellow_tang$count %>% mean()
## [1] 5.026948
yellow_tang$count %>% var()
## [1] 35.96793

var if Poisson distribution - in Poisson, variance = mean - so, var = 5.03 in Poisson distribution - variance is way higher! Seems like this data is zero-inflated, preventing it from fitting the Poisson distribution

#7. simulate five sets of random draws from Poisson dist - use same looping strategy as above

# Poisson parameters taken from the observed counts
n <- nrow(yellow_tang)
lambda <- mean(yellow_tang$count)

set.seed(123)

for (draw in seq_len(5)) {
  sim_counts <- rpois(n, lambda)

  # show the simulated distribution
  discrete.histogram(sim_counts)

  # report summary statistics for this draw
  cat("\nIteration", draw, ":\n")
  cat("Mean:", mean(sim_counts), "\n")
  cat("Variance:", var(sim_counts), "\n\n")

  Sys.sleep(1)
}

## 
## Iteration 1 :
## Mean: 4.962127 
## Variance: 4.800314

## 
## Iteration 2 :
## Mean: 5.067007 
## Variance: 5.346819

## 
## Iteration 3 :
## Mean: 5.064093 
## Variance: 5.011924

## 
## Iteration 4 :
## Mean: 4.92571 
## Variance: 4.707305

## 
## Iteration 5 :
## Mean: 5.059723 
## Variance: 4.882728

how are these distributions different? - these simulated distributions more closely resemble the normal dist (maybe b/c n is so high?) - in the simulated distributions we lose the abundance of 0,1,2 count instances, and the tail of really large counts (up to 52!) - the simulated distributions appear to closely resemble the normal distribution, with a slight upward tail only up to 16

#8. trial and error parameter fitting from negative binomial dist - can model counts with more variability than Poisson dist. - use same strategy as above

n <- nrow(yellow_tang)
mu <- mean(yellow_tang$count)

set.seed(123)

theta_values <- seq(0.1, 1000, length.out = 10) # generate 10 theta values from 0.1 to 1000

# Simulate negative-binomial draws for each theta (MASS::rnegbin's dispersion
# parameter; variance = mu + mu^2/theta per its documentation, so smaller
# theta means more overdispersion), plotting a histogram and printing the
# sample mean/variance so each theta can be compared against the observed
# counts (mean ~5.0, var ~36.0).
#
# theta_values: dispersion parameters to try
# n_draws:      number of random draws per theta (defaults to global n, the
#               number of survey rows, for backward compatibility)
# mean_count:   negative-binomial mean (defaults to global mu)
#
# Called for its plotting/printing side effects; returns nothing useful.
# (Note: the inner variable was originally misnamed "beta_random_fish";
# these are negative-binomial draws.)
simulate_neg_binomial <- function(theta_values, n_draws = n, mean_count = mu) {
  for (theta in theta_values) {
    nb_random_fish <- rnegbin(n_draws, mean_count, theta)

    # plot discrete histogram
    discrete.histogram(nb_random_fish, xlab = "Count",
                       main = paste("Histogram for theta =", round(theta, 2)))

    # calculate mean and variance
    mean_value <- mean(nb_random_fish)
    var_value <- var(nb_random_fish)

    # print results
    cat("\nTheta =", round(theta, 2), ":\n")
    cat("Mean:", mean_value, "\n")
    cat("Variance:", var_value, "\n\n")

    Sys.sleep(1)
  }
}

# start with .1 - 1000
simulate_neg_binomial(theta_values)

## 
## Theta = 0.1 :
## Mean: 5.01311 
## Variance: 226.4998

## 
## Theta = 111.2 :
## Mean: 5.002913 
## Variance: 4.84693

## 
## Theta = 222.3 :
## Mean: 4.941005 
## Variance: 4.8471

## 
## Theta = 333.4 :
## Mean: 4.941005 
## Variance: 5.132814

## 
## Theta = 444.5 :
## Mean: 5.010197 
## Variance: 5.052374

## 
## Theta = 555.6 :
## Mean: 4.922797 
## Variance: 5.020274

## 
## Theta = 666.7 :
## Mean: 4.958485 
## Variance: 5.172474

## 
## Theta = 777.8 :
## Mean: 5.038602 
## Variance: 5.147926

## 
## Theta = 888.9 :
## Mean: 5.003642 
## Variance: 4.493427

## 
## Theta = 1000 :
## Mean: 5.060452 
## Variance: 5.091824

okay, awesome! clearly, theta needs to be a very small number, good to know! - let’s run again starting at 0.01 and going to 1, see how this goes

# now run with theta from 0.01 to 1
theta_values <- seq(from = 0.01, to = 1, length.out = 10)
simulate_neg_binomial(theta_values)

## 
## Theta = 0.01 :
## Mean: 8.784414 
## Variance: 6155.331

## 
## Theta = 0.12 :
## Mean: 5.927167 
## Variance: 307.6754

## 
## Theta = 0.23 :
## Mean: 5.00874 
## Variance: 111.2288

## 
## Theta = 0.34 :
## Mean: 4.959213 
## Variance: 80.57559

## 
## Theta = 0.45 :
## Mean: 5.086672 
## Variance: 64.22645

## 
## Theta = 0.56 :
## Mean: 4.855062 
## Variance: 44.3937

## 
## Theta = 0.67 :
## Mean: 5.101238 
## Variance: 43.19018

## 
## Theta = 0.78 :
## Mean: 5.121631 
## Variance: 33.45385

## 
## Theta = 0.89 :
## Mean: 5.241806 
## Variance: 37.11204

## 
## Theta = 1 :
## Mean: 5.057538 
## Variance: 28.24523

super cool, 0.78 seems to really closely approximate observed dist. - further refine

# refine with theta from 0.6 to 0.9
theta_values <- seq(from = 0.6, to = 0.9, length.out = 10)
simulate_neg_binomial(theta_values)

## 
## Theta = 0.6 :
## Mean: 4.93445 
## Variance: 45.50153

## 
## Theta = 0.63 :
## Mean: 4.9563 
## Variance: 43.8873

## 
## Theta = 0.67 :
## Mean: 4.92571 
## Variance: 41.01197

## 
## Theta = 0.7 :
## Mean: 4.996358 
## Variance: 40.25728

## 
## Theta = 0.73 :
## Mean: 5.28405 
## Variance: 44.42946

## 
## Theta = 0.77 :
## Mean: 5.13984 
## Variance: 39.53291

## 
## Theta = 0.8 :
## Mean: 5.182083 
## Variance: 40.97848

## 
## Theta = 0.83 :
## Mean: 4.79898 
## Variance: 30.54265

## 
## Theta = 0.87 :
## Mean: 5.350328 
## Variance: 37.14759

## 
## Theta = 0.9 :
## Mean: 5.040787 
## Variance: 33.86277

my final choice is theta = 0.73 - I like that it appropriately models the abundance of low count observations, and includes a tail for high count values that is close to the observed max of ~50